package com.github.elazarl.multireducers;

import com.github.elazarl.multireducers.example.ExampleRunner;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableMultiset;
import com.google.common.collect.Multiset;
import com.google.common.io.ByteStreams;
import com.google.common.io.InputSupplier;
import com.google.common.io.Resources;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapred.ClusterMapReduceTestCase;
import org.apache.hadoop.mapred.JobConf;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.EnumSet;
import java.util.Properties;

import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
/**
 * Runs the example job end-to-end on a mini MapReduce cluster and verifies its output.
 */
public class MultiIT extends ClusterMapReduceTestCase {
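
    // Start a mini cluster with a tiny sort buffer and spill threshold so the map
    // tasks spill to disk even with a modest amount of test input.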
    @Override
    protected void setUp() throws Exception {
        File logDir = new File("/tmp/logs");
        FileUtils.deleteDirectory(logDir);
        assertThat(logDir.mkdirs(), is(true));
        System.setProperty("hadoop.log.dir", "/tmp/logs");
        Properties properties = new Properties();
        properties.setProperty("io.sort.mb", "1");
        properties.setProperty("io.sort.spill.percent", "0.0000001");
        super.startCluster(true, properties);
    }
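
    // Generates a large input file, runs the example job over it on the mini cluster,
    // and checks the aggregated contents of its "first" and "second" outputs.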
    public void testExample() throws Exception {
        ExampleRunner exampleRunner = new ExampleRunner();
        JobConf conf = createJobConf();
        conf.setNumReduceTasks(10);
        exampleRunner.setConf(conf);
        final FileContext fc = FileContext.getFileContext(conf);
        fc.mkdir(getInputDir(), FsPermission.getDefault(), true);
        Path inputFile = new Path(getInputDir(), "input.txt");
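        // repeat the sample input over a million times so the 1 MB sort buffer spills repeatedly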
        int times = 1024 * 1024 + 1;
        createInputFile(fc.create(inputFile, EnumSet.of(CreateFlag.CREATE)), times);
        assertThat(exampleRunner.run(new String[]{getTestRootDir() + inputFile.toString(), getTestRootDir() + getOutputDir().toString()}),
                is(0));
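        // gather the part-r-* files written under each of the job's two output directories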
        FileStatus[] first = fc.util().listStatus(new Path(getOutputDir(), "first"), new GlobFilter("part-r-*"));
        FileStatus[] second = fc.util().listStatus(new Path(getOutputDir(), "second"), new GlobFilter("part-r-*"));
        Multiset<String> countFirst = HashMultiset.create();
        Multiset<String> countSecond = HashMultiset.create();
        fillMapFromFile(fc, first, countFirst);
        fillMapFromFile(fc, second, countSecond);
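        // the "first" output should hold each name and its prefix_-ed variant, and the
        // "second" output the expected numeric keys, with counts scaling with `times`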
        assertThat(ImmutableMultiset.copyOf(countFirst), is(new ImmutableMultiset.Builder<String>().
                addCopies("john", 2 * times).
                addCopies("dough", times).
                addCopies("joe", times).
                addCopies("moe", times).
                addCopies("prefix_john", 2 * times).
                addCopies("prefix_dough", times).
                addCopies("prefix_joe", times).
                addCopies("prefix_moe", times).build()));
        assertThat(ImmutableMultiset.copyOf(countSecond), is(new ImmutableMultiset.Builder<String>().
                addCopies("120", times).
                addCopies("130", 2 * times).
                addCopies("180", times).
                addCopies("190", times).build()));
    }
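
    // Reads each given part file with MultiJobTest.toMap and merges the result into the supplied multiset.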
    private void fillMapFromFile(final FileContext fc, FileStatus[] files, Multiset<String> counts) throws IOException {
        for (final FileStatus status : files) {
            Multiset<String> map = MultiJobTest.toMap(new InputSupplier<InputStream>() {
                @Override
                public InputStream getInput() throws IOException {
                    return fc.open(status.getPath());
                }
            });
            counts.addAll(map);
        }
    }
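
    // Writes the example_input.txt resource to the stream the requested number of times, then closes it.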
    private void createInputFile(OutputStream out, int times) throws IOException {
        try {
            URL inputResource = getClass().getClassLoader().getResource("example_input.txt");
            // use a JUnit assertion rather than `assert`, which is disabled unless -ea is set
            assertNotNull(inputResource);
            byte[] buf = ByteStreams.toByteArray(Resources.newInputStreamSupplier(inputResource));
            for (int i = 0; i < times; i++) {
                out.write(buf);
            }
        } finally {
            out.close();
        }
    }
}